/*******************************************************************************
*File name: sipp_family_income_to_individual_income_ratios.do

*Description: 
*Overview
This is based on the do-files that created Appendix Table A2, which was written 
on SIPP Synthetic Beta and run on SIPP Gold. 
Note that the variable names are very different in the SIPP Synthetic Beta as 
compared to the public use files. 

We use SIPP to calculate the ratios of family income to individual income 
at the various BPs to adjust the elasticities. 
*******************************************************************************/


/*******************************************************************************
** SET ENVIRONMENT
*******************************************************************************/

clear all
set more off

** Working directory

global path "<insert file path>" /*UPDATE THE PATH*/

local workdir "${path}"
cd "`workdir'"

** Set global macros
global version "v1"
global current_date: display %td_CCYYNNDD date(c(current_date), "DMY")
global current_date = strtrim("${current_date}")

** install gtools to speed up egen. gegen will replace egen.
//ssc install gtools
//gtools, upgrade

/*******************************************************************************
** LOAD TOPICAL MODULE WAVE 2 -- FERTILITY HISTORY
** Just need to add a new variables here to prepare for a data merge later
*******************************************************************************/

local years 1996 2001 2004 2008
foreach year in `years' {
	
	** load the SIPP topical module wave 2
	use "data/`year'/`year'tm2.dta", clear // always wave 2 here

	gen srefmon = 4 // Must create a variable for the reference month and set it equal to 4
					// only records for month four are included in the topical module files
	
	tempfile `year'tm2_v2
	save ``year'tm2_v2', replace
	
}
					 
/*******************************************************************************
** LOAD CORE WAVES
*******************************************************************************/

clear all 

foreach year in 1996 2001 2004 2008 {
	
if (`year' == 1996) local yr 96
if (`year' == 1996) local wave 12
																	
if (`year' == 2001) local yr 01
if (`year' == 2001) local wave 9

if (`year' == 2004) local yr 04
if (`year' == 2004) local wave 12

if (`year' == 2008) local yr 08
if (`year' == 2008) local wave 16

																
** added a bunch of new variables compared to the first rund of analysis
forv i=1/`wave' {
	local v1 "ssuseq ssuid spanel swave srotaton srefmon rhcalmn rhcalyr lgtmon rfid" // survey structure related
	local v2 "shhadid epppnum ehrefper *wgt lgtkey" // person, HH identifier, weights
	local v3 "eresnss* rcutyp03 rcutyp04 rcutyp20 er27 rcuown27 rcutyp27 rcutyp57 er04 t20amt t27amt a27amt tfsocsec" // safety net
	local v4 "tage aage" // age variabels
	local v5 "aresnss2 aresnss1 ar27 acdmth ar04" // allocation flags
	local v6 "er01a ar01a t01amta t01amtk a01amta t03amta a03amta t03amtk a03amtk t04amt a04amt t38amt a38amt" 
	local v7 "esex erace arace eorigin aorigin etypmom ebmnth tbyear esfr errp tagess aagess ems ams eeducate"
	local v8 "eeducate tptotinc tpearn  rcutyp58  ecrmth ehemply ahemply  t08amt a20amt t10amt a10amt thtotinc thsocsec"
	local v9 "epnspous apnspous esfrfper esfspse esfkind epnmom epndad"
	local vars "`v1' `v2' `v3' `v4' `v5' `v6' `v7' `v8' `v9'"
	
	use `vars' using "data/`year'/`year'w`i'.dta", clear 
	tempfile `yr'w`i'
	save ``yr'w`i'', replace
}


** append all
use ``yr'w1', clear
quietly forv i=2/`wave' {
	append using ``yr'w`i''
}

** household identifier
** family identifier see 2008 user guide page 225. 
gegen hh_id = group(spanel ssuid shhadid) 

** (primary) family identifier 
gegen fam_id = group(spanel ssuid shhadid rfid) // all family members who are related and living together

** person identifier 
gegen person_id = group(spanel ssuid shhadid epppnum) 
cap gen all = 1

** calendar month/year for reference month
gen yearmo = ym(rhcalyr, rhcalmn) // this is easier to work with than "calyrmo" created below
format yearmo %tm

** day/month/year
gen yearmod = mdy(rhcalmn, 1, rhcalyr) // first date of month/year
format yearmod %td 

sort fam_id person_id yearmo


/*******************************************************************************
** COMBINE SPOUSAL INCOME
*******************************************************************************/


** produce family income figures by adding in spousal income
destring epppnum, generate(epppnum_n) // convert person number to number

sort hh_id person_id yearmo

gen w_spouse = epnspous != . & apnspous != 3

** hh_id small large 
gen flag_size = epppnum_n > epnspous 
tab flag_size, missing

egen pid_spouseid = concat(hh_id epppnum_n epnspous), p("_") 
egen pid_spouseid2 = concat(hh_id epnspous epppnum_n), p("_") 

gen str20 spouse_unit = pid_spouseid if flag_size == 0 
replace spouse_unit = pid_spouseid2 if flag_size == 1

isid yearmo person_id spouse_unit

tempfile sipp_`year'
save `sipp_`year'', replace

** get the added spousal income
collapse (sum) tptotinc, by(yearmo spouse_unit)

rename tptotinc com_spousal_inc 

label var com_spousal_inc "combined spousal income"

tempfile `year'_spousal_inc
save ``year'_spousal_inc', replace

** merge the combined spousal income back into the original dataset
use `sipp_`year'', clear

merge m:m yearmo spouse_unit using ``year'_spousal_inc', keepusing(com_spousal_inc) 
drop _merge

/*******************************************************************************
** DATES
construct sipp_panel_beg_date and sipp_panel_end_date in Synthetic SIPP Data
********************************************************************************/

gen birthdate = mdy(ebmnth, 1, tbyear) 
format birthdate %td

sort person_id yearmo
by person_id, sort: gen first_appearance = _n == 1 // a person's first appearance

gen sipp_panel_beg_date_1 = yearmod if first_appearance == 1 

gegen sipp_panel_beg_date = max(sipp_panel_beg_date_1), by(person_id) // copy down
format sipp_panel_beg_date %td
				 
gsort person_id -yearmo
by person_id, sort: gen last_appearance = _n == 1 // a person's first appearance
		
gen sipp_panel_end_date_1 = mdy(rhcalmn, 30, rhcalyr) if last_appearance == 1 
			
gegen sipp_panel_end_date = max(sipp_panel_end_date_1), by(person_id) // copy down
format sipp_panel_end_date %td			

** only keep answers to the most recent month
keep if srefmon == 4 // dealing with seam bias, keep only reporting month observations. 
					 // Treat the data longitudinally as 4 month snap shots

				 
/*******************************************************************************
** MERGE WITH SIPP TOPICAL MODULE FOR FERTILITY HISTORY
********************************************************************************/
	

/* Based on H. Luke Shaefer SIPP slides
We can merge topical module content into the core using person/household identifier and wave
TM observations generally attach to the 4th reference month of the wave they were conducted in
*/

** merge data from the second topical module to the full panel file.  
** The second topical module covers fertility history
** https://www.census.gov/programs-surveys/sipp/tech-documentation/topical-modules.html
** all records from the topical module are matched to the core wave.
local fertility_history tfrchl afrchl tmomchl amomchl emomlivh tfbrthyr tlbirtyr efblivnw elblivnw 
merge 1:1 ssuid epppnum swave srefmon using ``year'tm2_v2', keepusing(`fertility_history') 

describe `fertility_history' 
		  		 
/* last_admin_birthdate
https://www2.ncrn.cornell.edu/ced2ar-web/codebooks/ssb/v/v602/vars/last_admin_birthdate

This variable contains the administrative birthdate for the last biological child. The universe for this variable is all women between the ages of 15 and 65 at the time of the fertility history topical module. 

This variable was created by first looking for biological children on the SIPP household roster and choosing the birthdate of the youngest child (if no children were found, the value was set to missing). 

The total number of biological children reported on the roster (could possibly be zero) was then compared to the woman's report in her fertility history about the number of children born to her. If the number born to her was larger, last_admin_birthdate was set to missing and imputed using the woman's report of the last year she gave birth as a predictor variable. This process allowed us to create a fertility history that was consistent with the children reported on the roster and their administrative birthdates but still handle cases where older children or all children lived outside the household.
*/

/* All females aged 15-65 and the respondent is pointed to as the biological mother of a 
child in the household and she has one or more children at the time of the fertility history topical module */

gen etypmom_1 = etypmom == 1 // pick the the biological child in the family
gegen etypmom_fam = max(etypmom_1), by(fam_id) // copy down, with a biological child in the family
drop etypmom_1

gen women_universe = inrange(tage, 15, 65) & esex == 2 & etypmom_fam == 1 & tmomchl >= 1 & tmomchl != . & _merge == 3 // only for one record per woman in the unverse
gegen women_universe_p = max(women_universe), by(person_id) // copy down to all the person's records
gegen women_universe_fam = max(women_universe), by(fam_id) // copy down to family

** use the first day of the birth month/year
gen biological_child_birthday = mdy(ebmnth, 1, tbyear) if etypmom == 1 & women_universe_fam == 1 // only keep the bdays of biological children of women in the universe
format biological_child_birthday %td

** birthday of the last biological child
gegen last_admin_birthdate = min(biological_child_birthday), by(fam_id)
replace last_admin_birthdate = . if women_universe_p != 1. // recode to missing for folks outside the universe
format last_admin_birthdate %td
sum last_admin_birthdate

** order and sort data so it's easier to browse
order fam_id hh_id person_id yearmo tage esex esfr errp 
sort fam_id person_id yearmo

** calculate if there are any dependents in the family for female spouses

* NOTE: last_admin_birthdate = birthdate of youngest child if observation female, 
* 15-65 during fertility history module (which was asked in all panels)

gen youngest_age = (sipp_panel_beg_date-last_admin_birthdate)/365.25 // will be missing if male or didn't have a child, age in years otherwise

replace youngest_age = . if youngest_age < 0 // children born after the fertility history module

gen hasdepkids = 1 if youngest_age<18 // will be missing if male, didn't have a child, or child was over 18; 1 otherwise

* apply this determination to both males and females in the family
bysort fam_id: egen hasdep = min(hasdepkids) // if female spouse has hasdepkids==1, then will set this for male spouse as well


/********************************************************************************
** another definition of has dependents
********************************************************************************/


/* Do you think it will be reasonable to count a DI beneficiary as having a dependent if s/he has a child under 18 co-residing with him/her at the time of the SIPP survey? */


* Identify children under 18, When you qualify for Social Security disability benefits, your children may also qualify to receive benefits on your record. Your eligible child can be your biological child, adopted child, or stepchild. A dependent grandchild may also qualify. https://www.ssa.gov/benefits/disability/family.html#anchor3
//gen byte children = 1 if tage < 18 & (epnmom != 9999 | epndad!= 9999)

count if tage < 18 & inrange(ems, 1, 5)

gen byte children = tage < 18 & ems == 6

gegen hasdep2 = max(children), by(fam_id yearmod) 
replace hasdep2 = . if hasdep2 == 0

compare hasdep hasdep2

egen nkids2 = sum(children), by(fam_id yearmod) 
la var nkids2 "number of kids"

/*******************************************************************************
** Data Restriction 4: age restrictions in SIPP gold
** Restriction should probably be 21-59 at the start of the sample period
** Restriction 2 and 3 follow 4 because I need to apply the age restriction first
********************************************************************************/

*age
gen sipp_beg_age = (sipp_panel_beg_date - birthdate)/365.25 
gen sipp_end_age = (sipp_panel_end_date - birthdate)/365.25

*panel years
gen panel_startyr = year(sipp_panel_beg_date)
gen panel_finyr = year(sipp_panel_end_date)

** Restriction should probably be 21-59 at the start of the sample period
gegen sipp_beg_age_p = min(sipp_beg_age), by(person_id) // copy the sipp_beg_age down to all records of the person

keep if inrange(sipp_beg_age_p, 21, 59) // although appendix table A2 age restrictions seem to apply to 21 and 61

gegen sipp_end_age_p = max(sipp_end_age), by(person_id)

/*******************************************************************************
** Data Restriction 2
********************************************************************************/

/*Data Restriction 2: disability case not reduced for age
NOTE: We want to drop this group altogether from the sample
NOTE2: each of benefit values are indicators for receiving benefits conditional on applying, missing o/w */

tab aresnss1 eresnss1, missing

gen rest1 = . // indicator if receiving retirement/survivors insurance while under age 60
replace rest1 = 1 if (eresnss1 == 1 | eresnss2 == 1) & sipp_end_age_p < 60 // receiving SSI, retired, while under 60
replace rest1 = 1 if (eresnss1 == 3 | eresnss2 == 3) & sipp_end_age_p < 60 & sipp_end_age_p > 18 // receiving SSI, widowed, under 60
la var rest1 "if receiving retirement/survivors insurance while under age 60"

tabstat rest1, stat(sum)

drop if rest1 == 1

/********************************************************************************
** Data Restriction 3
** Apply the restriction similar to the first round of SIPP analysis:
** remove anyone who reported SSDI coverage for only 1 or 2 waves of the panel. 
********************************************************************************/

** We can't apply this restriction exactly. 
** below comment is from the Census' code.
/*Data Restriction 3: there is a single date of filing
NOTE1: Since we do not observe date of filing, we instead restrict to cases 
	where there's only one occurrence of eligibility. 
NOTE2: If never applied, then kept in the "non-SSDI" group; if applied multiple times
	or applied out of order, then drop from the sample altogether. For the last two cases, 
	I create a flag here; I summarize and drop immediately before the summary stats 
	so we will know the number of observations as a data quality measure. */	

gen ssdi_mo = (er01a==1) & (ar01a==0) // receive income from social security, not imputed
//putting in the age restriction pretty much isolates disability recipients

** total number of waves that a person received SSDI
by spanel ssuid shhadid epppnum swave, sort : gen n1 = _n == 1 if ssdi == 1 
	// mark the first ssdi==1 within the wave as 1, and 0 for the rest values
replace n1 = 0 if n1 == .

gegen n_waves_ssdi = total(n1), by(person_id)
label variable n_waves_ssdi "Number of waves reporting SSDI coverage"

drop if n_waves_ssdi == 1 | n_waves_ssdi == 2
drop ssdi


/********************************************************************************
** Identifying SSDI payments, SSDI, and SSI
********************************************************************************/


/*** Restrictions from the paper, following the admin data sample construction ****
NOTE: Tim's file enforces 4 restrictions, but we can only enforce two of them on 
the SSB data. */

** we can't construct the SSI variable the same way census does based on the SIPP gold because it's from the admin variables
** I use the same definition for the first round analysis

** month-person variables. varies by month
gen ssdi = (er01a==1) & (ar01a==0) // updated on 08/11/2021 

gen ssi = inlist(1, rcutyp03, rcutyp04)					// month-person, get SSI (federal and state)
replace ssi = 0 if aresnss1 != 0 						// aresnss1: allocation flag for eresnss1
replace ssi = 0 if aresnss2 != 0 						// aresnss2: allocation flag for eresnss2
replace ssi = 0 if ar04 != 0 & er04 == 1				// ar04: allocation flag for ero4 (state SSI)
		
tabstat ssdi ssi, stat(sum)		
		
label variable ssdi "SSDI monthly participation flag"
label define ssdi 0 "No" 1 "Yes" 
label values ssdi ssdi	

label variable ssi "SSI monthly participation flag"
label define ssi 0 "No" 1 "Yes" 
label values ssi ssi	

gegen ssi_ever = max(ssi), by(person_id)    // ever receive ssi during the panel period, time invariant

/*******************************************************************************
** reduce to one person per panel structure
** keep the first month where there is reported SSI or SSDI coverage.
*******************************************************************************/

/* identify PIA variables
As soon as we restrict to people with one date of eligibility, all people have 
only one PIA value, which is stored in mbr_ssdi_benefit_totamt_1 (eligible), 
phus_ssdi_benefit_totamt_1 (paid). Moreover, the eligibility determination date 
is currently stored in mbr_ssdi_doed_1 (MBR SSDI: Date of entitlement to disability). */


/*
We want to keep one person per pernal. Which person month to use?
for sipp, for income, we incline to use earlier observations because ppl are more serious when then first the survey first appears

use the first reference month, the first time ssdi recipients appeared in SIPP with ssdi coverage

SIPP asked about age social security disability payments began (tagess). most of them began sipp coverage before the sipp panel. For example, a person reported the first payment date in Jan 2001. and in the 2004 panel, the person reporting DI coverage in Jan 2004. 3 years after his reported SIPP coverage. in this case, we will just use his reported DI payment amount in Jan 2004 -- just the social security payment as the DI variable (t01amta). 

only exception is that if the person reports DI coverage in Jan 2004, and he also says that's the first month he gets DI. then look for the next wave that he reports DI coverage, and then use that DI income variable as DI payment. 
the reason is because DI takes a long time to process you can start applying and get the payment a year after (back payment). it's often a lumsum payment. we don't want to use that to proxy how much the person gets. look at the next wave in this case 
*/

** most ppl had ssdi coverage before the panel 
compare tage tagess if ssi == 1

** in a few observations, tagess is not in the universe in one wave but in the universe in another wave in the panel
** drop tagess not in the universe or tagess is imputed
drop if tagess  == -1 | aagess != 0

** if the person report first time ssdi receipient in this panel
** only a few hundrend records
gen concurrent_flag = tage == tagess & aagess == 0 & ssdi == 1 // ??? ssi or ssdi here

** for those report SSDI coverage before the panel
** keep the first month where there is reported SSI or SSDI coverage.
gen date_dup2 = yearmo if ssdi == 1 & concurrent_flag != 1 // duplicate date variable, only limited to ssi/ssdi or both covered

sort person_id											 
by person_id: gegen first_month_reporting = min(date_dup2) // first reported ssdi coverage 
format first_month_reporting %tm

gen keep_flag = yearmo == first_month_reporting // we'll only have one record per person
drop date_dup2

** In a few cases, the ssdi payment for the same person vary quite a bit -- likely retroactive payments
** since the eligibility date is just a proxy. in stead of looking for the next wave, just keep the min ssdi payment for ssdi receipients. that being said, when you look at the data, they are not super different. 

gen dup_t01amta = t01amta if ssdi == 1 & concurrent_flag == 1 // only for folks that report first ssdi payment in the panel

sort person_id t01amta
by person_id, sort: gen min_ssdi_payment = _n == 1  // just keep the record with min payments

** keep the first SSDI appearance if the reported ssdi coverage before the panel, or first ssdi coverage in the panel, then choose the min ssdi payment
keep if keep_flag == 1 | (concurrent_flag == 1 & min_ssdi_payment == 1)

** there are a few cases where tagess is not consistent in SSDI receipient's response. They said they got SSDI initial coverage at different ages in different waves. This causes the discrepancy between keep_flag and min_ssdi_payment. In this case, use the keep_flag one because it has a more recent value which is less likely to be a back payment. At the end of the day, it doesn't really matter much since the # of such case is quite small and the t01amta values are similar for such cases.
sort person_id
quietly by person_id:  gen dup = cond(_N==1,0,_n)

tab dup 

drop if dup != 0 & keep_flag != 1

** at this point, each person should only have one record. check
isid person_id 

** get rid of variables we created and we don't need anymore
drop first_month_reporting keep_flag dup_t01amta min_ssdi_payment dup


/********************************************************************************
** create proxy for eligibility date and payment start date
********************************************************************************/


**generate dates of eligibility

/* mbr_ssdi_doed, MBR SSDI: 
The date at which the disabled individual became entitled to disability insurance 
benefits for the associated SSDI application. This date is stored as a SAS date variable. 
See mbr_ssdi_applied_k for details on how individual applications are recorded on this file.
*/

** NOTE: we don't have the eligibility and payment dates in sipp public. 
** SIPP asked the age they began receiving SSID. We'll use it as a proxy. use it as a proxy for the eligibility and payment dates.

gen mbr_ssdi_doed_1 = mdy(ebmnth, 1, tbyear+tagess) if aagess == 0 // only use it when tagess is not imputed
label var mbr_ssdi_doed_1 "Date of entitlement to disability, proxy"
format mbr_ssdi_doed_1 %td

*generate dates of eligibility
gen pia_year = year(mbr_ssdi_doed_1) 		// year of entitlement to disability
gen pia_month = month(mbr_ssdi_doed_1) 		// month of entitlement to disability
//gen pia_yyyymm = pia_year*100 + pia_month // don't copy census code directly. the result is funny
** calendar month/year 
gen pia_yyyymm = ym(pia_year, pia_month) 
format pia_yyyymm %tm

tab pia_year, missing
count if pia_year < 1992 // how many have pia_year before 1992, the lower bound year

** again, use the declared age receiving SSDI as the age that the person became eligibile. 
gen age_elig = tagess if aagess == 0
gen age_filing = age_elig // proxy for "age when application was filed"
la var age_elig "Age declared eligible for SSDI"

** mbr_ssdi_benefit_totamt_1, is the monthly amount of SSDI benefit. This is from the admind ata in SIPP synthetic Beta.
** Benefit amounts are calculated based on the worker's average indexed monthly earnings (AIME). 
** here just use the amount of social security (adult) received for self in this month (not imputed)
gen mbr_ssdi_benefit_totamt_1 = t01amta if a01amta == 0 & ssdi == 1 
label var mbr_ssdi_benefit_totamt_1 "monthly amount of SSDI benefit"

gen pia_nom = mbr_ssdi_benefit_totamt_1 // nominal pia
** NOTE: in the 2001 panel, a person has t01amta of $18,000. It's so high and skew the CDF of PIA a lot.
** likely a data entry error? Not exactly sure. Drop it here. 
** 1996 panel also has a pia_nom slightly about 7000 in 1995. inflation adjusted pia goes above 10000. cdf is ugly. unlikely an error
drop if pia_nom >= 15000 & pia_nom != .

** payment start date

/*phus_ssdi_benefit_stdate_1
This start date variable tells when the first SSDI benefit payment was recorded in the Payment History Update System, the administrative database maintained by SSA to track actual payments made to beneficiaries. This start date can differ from the MBR start date which records only eligibility and not actual payments. The PHUS began in 1984 and hence the earliest possible start date is January 1984. The latest possible start date is December 2012.*/
* Add variables for when first payments are made
**  For each group, keep the last month where there is reported SSI or SSDI coverage.

gen phus_ssdi_benefit_stdate_1 = mbr_ssdi_doed_1
format phus_ssdi_benefit_stdate_1 %td
label var phus_ssdi_benefit_stdate_1 "SSDI benefit start date, proxy"


/********************************************************************************
** Cost of Living Adjustment (COLA)
** https://www.ssa.gov/cola/
********************************************************************************/


*identify the Indexed Monthly Earnings (IME) year from the Primary Insurance Amount (PIA) yyyymm
* the IME year starts from 1992 and ends in 2012. some people have pia year goes back in the 80s or earlier
gen imeyear = .

forv i = 1992/2012 { // this is the lower bound year and the imeyear
	loc j = `i' + 1 // this is the upper bound year
	replace imeyear=`i' if ym(`i', 6)<=pia_yyyymm & pia_yyyymm<= ym(`j',5)
	// observation is in a year `i' if between June of that year, May of next
	// adpated from Census code
}

compare pia_yyyymm imeyear

tab imeyear, missing // years before 1992 is missing. roughly 300

*create index for the national wage index and CPI. I'll "merge" using locals and 
*a loop instead of adding an extra dataset.

*make empty variables to replace into
gen awi_year=.  	// Average Wage Index year
gen wage_index=. 	// wage index
gen cola=.			// cost of living adjustments
gen cola_cum=.

*now make arrays, index for them
loc imeyear_array 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
loc awi_year_array 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
loc wage_index_array 21027.98 21811.6 22935.42 23132.67 23753.53 24705.66 25913.9 27426 28861.44 30469.84 32154.82 32921.92 33252.09 34064.95 35648.55 36952.94 38651.41 40405.48 41334.97 40711.61 41673.83
loc cola_array 3 2.6 2.8 2.6 2.9 2.1 1.3 2.5 3.5 2.6 1.4 2.1 2.7 4.1 3.3 2.3 5.8 0 0 3.6 1.7
loc cola_cum_array 1.683504961 1.634470836 1.593051497 1.549660989 1.510390828 1.467823934 1.437633628 1.419184233 1.384569983 1.337748776 1.303848709 1.285846853 1.259399465 1.226289644 1.177991973 1.14036009 1.114721496 1.053612 1.053612 1.053612 1.017
loc yearslength: word count `imeyear_array'

forv i=1/`yearslength' { //think of this as picking out the 1x5 data vector for each year
*store each number in seperate locals
loc imeyear: word `i' of `imeyear_array'
loc awi_year: word `i' of `awi_year_array'
loc wage_index: word `i' of `wage_index_array'
loc cola: word `i' of `cola_array'
loc cola_cum: word `i' of `cola_cum_array'

quietly replace awi_year=`awi_year' if imeyear==`imeyear'
quietly replace wage_index=`wage_index' if imeyear==`imeyear'
quietly replace cola=`cola' if imeyear==`imeyear'
quietly replace cola_cum=`cola_cum' if imeyear==`imeyear'
}

/*Deflating the PIA on the later six months based only on the CPI increase for that year, 
because PIA is increased even though AIME stays the same*/

gen pia_step = .
replace pia_step = pia_nom if 6<=pia_month & pia_month<=11
replace pia_step = pia_nom/(1+(cola/100)) if ~(6<=pia_month & pia_month<=11)

* Converting to 2013 CPI values
gen pia = pia_step*cola_cum 

compare pia pia_nom

compare pia pia_nom if pia_nom != . & pia == . // roughtly 300 observations with raw pia_nom, but pia is missing. because pia_yyyy is before 1992

* Add variables for when first payments are made
gen startpay_year = year(phus_ssdi_benefit_stdate_1)
gen startpay_month = month(phus_ssdi_benefit_stdate_1)
//gen startpay_yyyymm = startpay_year*100 + startpay_month
gen startpay_yyyymm = ym(startpay_year, startpay_month)
gen startpay = phus_ssdi_benefit_stdate_1

format startpay_yyyymm %tm
format startpay %td

/***** identify bend points using the PIA distribution from the admin data
For each panel, we have bend points at the following points in the cdf:
	LBP: 0.0055-0.1184		// lower bend point
	FMBP: 0.1294-0.4756  	// family maximum bend point
	UBP: 0.7658-0.9156   	// upper bend point
*/

sort pia

sum pia, detail
tabstat pia, stat(min median max)

gen year = spanel 

qui levelsof year // handy way to grab the four years
foreach yr in `r(levels)' { 
	// cumul pia if year==`yr' & ~mi(pia) & ssi==0, g(dist_`yr') eq  // the original code
	cumul pia if year==`yr' & ~mi(pia) & ssdi==1, generate(dist_`yr') equal  // cumulative distribution
	// estimate empirical cdf of PIA for each panel, store in dist_`yr'
	
	*now identify the bend points
	gen l_`yr' = 1 if  0.0055<= dist_`yr' & dist_`yr'<=0.1184	// lower bend points
	gen u_`yr' = 1 if  0.7658<= dist_`yr' & dist_`yr'<=0.9156	// upper bend points
	gen f_`yr' = 1 if  0.1294<= dist_`yr' & dist_`yr'<=0.4756 & hasdep2==1 // family-maximum bend points, with 1+ dependents
}

tempfile `year'_table_a2
save ``year'_table_a2', replace

} // end of year loop

/*******************************************************************************
** APPEND DATA
*******************************************************************************/


** pool the panels
use `1996_table_a2', clear
append using `2001_table_a2'
append using `2004_table_a2'
append using `2008_table_a2'

cap label drop spanel

gen group_bp1 = 1 if inlist(1, l_1996, l_2001, l_2004, l_2008) // =1 if in the LBP part of the distribution
gen group_bp2 = 1 if inlist(1, u_1996, u_2001, u_2004, u_2008) // upper bend point
gen group_bpf = 1 if inlist(1, f_1996, f_2001, f_2004, f_2008) // family maximum bend point

** dummy variables
gen all_ssdi = (pia>0 & ~mi(pia)) // all DI recipients
gen all_no_ssdi = all_ssdi==0 //all non-DI recipients
gen ssdi_no_ssi = (all_ssdi==1 & ssi==0) // SSDI recipients, no SSI
gen ssi_no_ssdi = (all_ssdi==0 & ssi==1) // SSI recipients, no SSDI
gen ssi_and_ssdi = (all_ssdi==1 & ssi==1) // SSI and SSDI recipients
gen no_ssi_no_ssdi = (all_ssdi==0 & ssi==0) // no SSI, no SSDI
gen all_ssi =  ssi==1

/********************************************************************************
** Prep for Summary Stats
********************************************************************************/

** Education, 5 categories
gen educ_5cat = .
replace educ_5cat = 1 if inrange(eeducate, 31, 38) // no high school diploma
replace educ_5cat = 2 if eeducate == 39 // high school graduate
replace educ_5cat = 3 if inrange(eeducate, 40, 43) // some college
replace educ_5cat = 4 if eeducate == 44 // college degree
replace educ_5cat = 5 if eeducate >= 45 & eeducate < . // grad degree

** education dummies
gen nohs = educ_5cat == 1 
gen hsdeg = educ_5cat == 2 
gen somecoll = educ_5cat == 3
gen colldeg = educ_5cat == 4
gen graddeg = educ_5cat == 5
	
*enter CPI-U-RS into the table

gen cpi_urs_2013 = .

*I have the full series in case we need to adjust later 
loc cpi_yrlist 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
loc cpi_vals 0.292226768 0.305084746 0.334015196 0.371127995 0.406779661 0.43132671 0.449444769 0.468439509 0.484219755 0.493278784 0.509643483 0.528345996 0.551139684 0.578316774 0.599357101 0.614260666 0.629748685 0.64289889 0.658386908 0.675920514 0.690531853 0.700175336 0.714786674 0.739041496 0.759789597 0.772063121 0.789888954 0.810929281 0.838398597 0.86528346 0.890122735 0.92402104 0.921098773 0.936294565 0.965517241 0.985680888 1

loc cpi_totyrs: word count `cpi_yrlist'

forv i=1/`cpi_totyrs'{
	loc cpiyr: word `i' of `cpi_yrlist'
	loc cpival: word `i' of `cpi_vals'
	replace cpi_urs_2013 = `cpival' if year == `cpiyr'
}

*generate indicators for the different years
foreach yr in 1996 2001 2004 2008 {
	gen yr`yr' = 1 if year == `yr'
}

** we did this part in script part 1
*for all variables observed in a series, generate variable that holds value for 
*  the first variable observed in the panel year (eg for yearly, panel year 
*  variable; for monthly, first variable in panel year).
gen totinc = tptotinc // Total Personal Income
gen totearn = tpearn // Total Earnings
gen fsamt = t27amt if a27amt == 0 // fsamt_M, SNAP/Food Stamps Amount Received
gen hicov = (rcutyp58 == 1 | rcutyp57 == 1 | ecrmth == 1) // Health Insurance Coverage, including medicaid, medicare, or other health ins
gen hiemp = (ehemply == 1 | ehemply == 2) & ahemply == 0 // Health Insurance Coverage from Employer (current or former)
gen afdcamt = t20amt // Amount of AFDC received
gen vcamt = t08amt if a20amt == 0 // Amount of veterans compensation or pension benefit
gen wcamt = t10amt if a10amt == 0 // Amount of Workers Compensation Received

** we don't have these variables. 
gen totearn_ser = . // Person-level annual earnings that were taxed by FICA
gen total_der_fica = . // Total earnings from FICA-covered jobs
gen total_der_nonfica = . // Total earnings from all non-FICA jobs

cap label drop spanel
rename spanel panel

qui levelsof panel

gen own_home = .
gen db_pension = .
gen dc_pension = .
gen pension_in_scope_empl = .
gen life_ins_1 = .
gen nonhouswealth = .
gen homeequity = .
gen totnetworth = .

loc nom_lhs nonhouswealth homeequity totnetworth com_spousal_inc totinc totearn totearn_ser total_der_fica total_der_nonfica fsamt afdcamt vcamt wcamt

*Adjust to 2013 dollars using cpi_urs
foreach var in `nom_lhs' {
	capture drop real_`var'
	gen r_`var' = (`var'/cpi_urs)
	_crcslbl r_`var' `var'
}


* race and ethinicity identifiers 
gen white = erace == 1 // white alone
gen black = erace == 2 // black alone
gen otherrace = (erace == 3 | erace == 4) // other race
gen hisp = eorigin == 1 // Hispanic dummy

gen white_nhisp = (white==1 & hisp==0)
gen white_hisp = (white==1 & hisp==1)
gen black_hisp = (black==1 & hisp==1)
gen black_nhisp = (black==1 & hisp==0)

la var white_nhisp "=1 if white, not Hispanic"
la var black_nhisp "=1 if black, not Hispanic"
la var white_hisp "=1 if white, Hispanic"
la var black_hisp "=1 if black, Hispanic"

** pia means for each of the groups

qui su mbr_ssdi_benefit_totamt_1 if group_bp1==1
di "mean: `r(mean)'; N: `r(N)'"
qui su mbr_ssdi_benefit_totamt_1 if group_bp2==1
di "mean: `r(mean)'; N: `r(N)'"
qui su mbr_ssdi_benefit_totamt_1 if group_bpf==1
di "mean: `r(mean)'; N: `r(N)'"
qui su mbr_ssdi_benefit_totamt_1 if all_ssdi==1
di "mean: `r(mean)'; N: `r(N)'"

qui su pia if group_bp1==1
di "mean: `r(mean)'; N: `r(N)'"
qui su pia if group_bp2==1
di "mean: `r(mean)'; N: `r(N)'"
qui su pia if group_bpf==1
di "mean: `r(mean)'; N: `r(N)'"
qui su pia if all_ssdi==1
di "mean: `r(mean)'; N: `r(N)'"


/* Now it's time to calculate my summary stats. I'm going to do this by group and by year so that I'm finding, for example, the average total expenditures in the last quarter in 1999 just among households within the bandwidth at the lower bend point in 1999. 
I'm going to save each of those statistics in a local and pull them later to put into an Excel sheet. */

gen married = ems == 1 | ems == 2 // married, spouse present or absent
la var married "Married with spouse present or absent"

gen ever_married = inrange(ems, 1, 5) // ever married
label variable ever_married "Ever married"

gen nkids = tfrchl if esex == 1 // the number of children ever born to a person (i.e. count of biological children)
replace nkids = tmomchl if esex == 2
la var nkids "number of kids"

replace nkids2 = . if nkids2 == 0

gen fs_ind = rcutyp27 == 1 & ar27 == 0
la var fs_ind "=1 if received food stamps"

*fraction with no earnings
gen no_earn = (inlist(totearn, 0, .)) // note that totearn is never missing (imputed if missing)
la var no_earn "=1 if no earnings"

/* create variable for fraction of DI subgroup with no non-DI income, calculated by checking if any source of income in the dataset is positive
Sources of income available in the SSB: earnings (totearn), food stamps (fsamt), AFDC assistance (afdcamt), veterans compensation (vcamt), 
	SSI (will use prior SSI determination), workers compensation (wcamt) */

loc inc_types totearn fsamt afdcamt vcamt wcamt

tabstat `inc_types', stat(min median mean max)

* convert amounts to indicators so we can use inlist
foreach var in `inc_types' {
	gen `var'_ind = .
	replace `var'_ind = 1 if `var'>0 & ~mi(`var')
}

tabstat totearn_ind fsamt_ind afdcamt_ind vcamt_ind wcamt_ind, stat(sum)

** earned income, 
gen inc_allDI = all_ssdi // restricts to just the DI population
replace inc_allDI = 0 if inlist(1, totearn_ind, fsamt_ind, afdcamt_ind, vcamt_ind, wcamt_ind, ssi) & ~mi(all_ssdi) // for DI recips, set to zero if also getting other income
la var inc_allDI "=1 if on DI and no non-DI income"

gen male = esex == 1

*variable labels for the summary stats
la var male "Male"
la var sipp_beg_age "Age at panel start"
la var white "White"
la var black "Black"
la var hisp "Hispanic"
la var otherrace "Other Race"
la var nohs "No HS deg."
la var hsdeg "HS deg."
la var somecoll "Attended college, no deg."
la var colldeg "College deg."
la var graddeg "Graduate deg."
la var r_totinc "Total Income, First Month"
la var r_totearn "Total Earnings, First Month"
la var r_totearn_ser "Total Yearly Earnings, First Year"
la var r_total_der_fica "Total Earnings, FICA jobs, 1st year"
la var r_total_der_nonfica "Total Earnings, non-FICA jobs, 1st year"
la var db_pension "Defined Benefit Pension Plan"
la var dc_pension "Defined Contrib. Pension Plan"
la var pension_in_scope_empl "In Scope for Pension"
la var life_ins_1 "Has Life Insurance"
la var r_fsamt "Food Stamps Amt"
la var hicov "Has Health Insurance"
la var hiemp "Has HI from employer"
la var r_afdcamt "AFDC Amt"


*add afdc indicator
gen afdc_ind =  rcutyp20 == 1 // receiving AFDC/TANF
la var afdc_ind "=1 if receives AFDC"

** make sure initwgt is person weight. 
rename wpfinwgt initwgt // person weight, but it's likely not right. 

gen only_di_inc_hh = thtotinc == thsocsec
label var only_di_inc_hh "Households with only DI income"
//  thtotinc == thsocsec // households with only DI income
// t01amta == tptotinc // individual with only DI income
tab only_di_inc_hh if pia != ., missing

gen only_di_inc_p = t01amta == tptotinc
label var only_di_inc_p "Person with only DI income"

tab only_di_inc_p if pia != ., missing

tab inc_allDI only_di_inc_hh if pia != ., missing

gen wkid = nkids >= 1 & nkids < .
tab wkid, missing
tab hasdep if pia != ., missing

save "data/1996_to_2008_table_a2.dta", replace

/*******************************************************************************
** SUMMARY STATISTICS
** without weights
********************************************************************************/

use "data/1996_to_2008_table_a2.dta", clear

sort fam_id person_id

local dems married ever_married nkids2 nohs 
local real_lhs r_totinc r_com_spousal_inc fs_ind r_fsamt afdc_ind r_afdcamt inc_allDI

** median and mean by each group
** lower bend point
tabstat `dems' `real_lhs' if group_bp1 == 1, stat(median mean) form(%15.3fc) save
tabstatmat LB // tabstatmat picks up matrices saved after tabstat. 
mat LB = LB' // transpose

** family maximum bend point
tabstat `dems' `real_lhs' if group_bpf == 1, stat(median mean) form(%15.3fc) save
tabstatmat FMB 
mat FMB = FMB' 

** upper bend point
tabstat `dems' `real_lhs' if group_bp2 == 1, stat(median mean) form(%15.3fc) save
tabstatmat UB 
mat UB = UB' 

** all respondents receiving DI income
tabstat `dems' `real_lhs' if all_ssdi == 1, stat(median mean) form(%15.3fc) save
tabstatmat DI 
mat DI = DI' 

** Horizontal concatenation
mat combined = LB, FMB, UB, DI

** number of observations in each group. save in separate matrices for exporting
foreach var in group_bp1 group_bpf group_bp2 all_ssdi {
	
	tabstat `var' if pia != ., stat(sum) save
	tabstatmat `var'_mat
}


/*******************************************************************************
** export to excel
*******************************************************************************/


local template "footnote19"
local templatepath "posted/`template'.xlsx"
local filepath "posted/`template'_${version}_${current_date}.xlsx"

copy "`templatepath'" "`filepath'", replace   // "copy A B, replace" will create a copy of the template

putexcel set  "`filepath'", modify sheet("raw")
putexcel B18 = (`"`c(current_date)' `c(current_time)'"') ///
		 C4 = mat(combined)                 ///
		 C16 = mat(group_bp1_mat)           ///
		 E16 = mat(group_bpf_mat)			///
		 G16 = mat(group_bp2_mat)			///
		 I16 = mat(all_ssdi_mat)

		 
/********************************************************************************
** End of File
********************************************************************************/


